Preparation

# Install any package this analysis needs that is not available yet.
# requireNamespace() only checks availability (it does not attach anything);
# attaching is done by the library() calls that follow.
# readr is included because it is loaded below via library(readr).
needed_packages <- c("DT", "ggplot2", "tidyverse", "hrbrthemes", "dplyr", "readr")
not_installed <- needed_packages[
  !vapply(needed_packages, requireNamespace, logical(1), quietly = TRUE)
]
if (length(not_installed) > 0) {
  install.packages(not_installed)
}

# Load stuff we need later
library(readr)
library(DT)
library(ggplot2)
library(tidyverse)
library(hrbrthemes)
library(dplyr)

# and set the working directory
# NOTE(review): hard-coded, user-specific path; the relative path given to
# read_csv() further down is resolved against this directory, so the script
# only runs unchanged on a machine with this exact layout.
setwd("~/projects/bbs-for-independence/03_workspace")

Import Data

# Load the dataset summary from CSV (column-type message suppressed) and
# append charratioDelta, the difference charratioB minus charratioA.
dataset <- read_csv("../.tmp/dataset.csv", show_col_types = FALSE)
dataset[["charratioDelta"]] <- dataset[["charratioB"]] - dataset[["charratioA"]]

Prepare Data

# Per-category mean, rounded to two decimals, of length, length_raw,
# avgcolumnsize, charratioA, charratioB and charratioDelta.
# Columns are selected by name instead of by position so the summary keeps
# picking the intended columns even if the CSV column order changes.
groups <- aggregate(
  dataset[, c(
    "length", "length_raw", "avgcolumnsize",
    "charratioA", "charratioB", "charratioDelta"
  )],
  list(dataset$category),
  FUN = function(x) {
    # Go through character so entries that cannot be parsed as numbers turn
    # into NA (conversion warning muted); na.rm = TRUE then ignores them.
    values <- suppressWarnings(as.numeric(as.character(x)))
    round(mean(values, na.rm = TRUE), digits = 2)
  }
)

# Print the number of files, i.e. the number of rows in the dataset
cat("Anzahl Dateien: ", nrow(dataset))
## Anzahl Dateien:  107364
# Print the number of categories, i.e. the number of rows in the aggregate
cat("Anzahl Kategorien: ", nrow(groups))
## Anzahl Kategorien:  46
# Show the per-category means, sorted by charratioB in descending order
groups %>% arrange(desc(charratioB))
Group.1 length length_raw avgcolumnsize charratioA charratioB charratioDelta
sex 25245.31 25689.20 66.57 0.73 0.95 0.22
digest 34519.19 34542.65 58.01 0.74 0.94 0.20
etext 299102.23 301868.73 56.19 0.72 0.94 0.22
law 29771.61 30244.16 61.56 0.72 0.94 0.22
politics 28007.38 28347.29 61.24 0.69 0.94 0.25
stories 27076.01 27511.67 64.07 0.71 0.94 0.23
news 12971.45 13154.94 79.36 0.73 0.93 0.20
occult 27980.03 28410.65 61.51 0.70 0.92 0.22
sf 38351.08 38987.91 56.19 0.69 0.92 0.22
survival 16287.49 16547.33 63.37 0.68 0.92 0.25
drugs 17921.99 18082.32 58.79 0.69 0.91 0.22
uploads 8816.45 8845.72 95.82 0.68 0.91 0.22
adventure 12030.71 12210.70 66.74 0.66 0.90 0.24
apple 17961.37 18178.18 71.89 0.63 0.90 0.26
conspiracy 21382.73 21588.19 58.61 0.69 0.90 0.21
food 9869.62 10089.01 48.61 0.66 0.90 0.24
fun 25376.78 25846.89 58.72 0.67 0.90 0.24
humor 13400.88 13677.52 52.81 0.68 0.90 0.22
rpg 41113.61 41716.81 56.43 0.67 0.90 0.24
anarchy 14598.59 14846.09 62.45 0.63 0.89 0.25
media 38898.56 39541.82 52.57 0.65 0.89 0.24
ufo 12556.73 12795.85 60.54 0.67 0.89 0.22
100 28605.01 28974.88 61.36 0.65 0.88 0.24
internet 44197.64 44858.94 54.36 0.66 0.88 0.22
games 18859.54 19132.80 60.54 0.63 0.87 0.25
groups 13316.24 13483.25 125.72 0.60 0.87 0.27
hacking 28084.39 28523.15 62.64 0.62 0.87 0.25
magazines 26874.41 27104.82 120.86 0.63 0.86 0.23
music 24969.30 25322.22 52.15 0.60 0.86 0.26
reports 11470.83 11667.04 67.95 0.62 0.86 0.24
virus 14520.71 14823.81 58.25 0.58 0.86 0.28
programming 37479.33 38219.20 55.93 0.58 0.85 0.28
computers 22598.36 23046.15 56.02 0.58 0.84 0.26
holiday 5166.49 5275.23 140.58 0.62 0.84 0.22
phreak 15324.95 15617.78 60.04 0.57 0.84 0.26
messages 41279.98 42040.92 50.41 0.61 0.82 0.21
hamradio 14198.19 14469.32 55.36 0.51 0.81 0.31
science 18272.69 18503.92 58.03 0.58 0.81 0.23
bbs 25039.61 25456.31 62.75 0.53 0.80 0.27
art 27190.70 27231.47 86.18 0.23 0.74 0.51
piracy 9269.16 9417.98 85.37 0.35 0.73 0.38
history 18394.00 18422.78 55.16 0.49 0.61 0.12
artifacts 27249.19 27378.12 820.73 0.39 0.48 0.09
exhibits 57330.67 57331.80 62.64 0.31 0.38 0.07
tap 227088.44 227092.95 35.22 0.25 0.28 0.03
floppies 986584.93 986598.72 34.40 0.25 0.27 0.03

Plots

Verhältnis Text (exkl. Satz- und Leerzeichen) zu Sonderzeichen

# Box plot of charratioA per category, categories ordered by their median.
# Individual observations are overlaid as faint jittered points and the
# category mean is marked with a red cross.
dataset %>%
  ggplot(aes(
    # na.rm = TRUE is forwarded to median() so that a category containing
    # missing ratios is still ordered by the median of its observed values
    # instead of getting an NA score.
    x = reorder(category, charratioA, FUN = median, na.rm = TRUE),
    y = charratioA,
    group = category
  )) +
  geom_boxplot() +
  theme(
    legend.position = "none",
    plot.title = element_text(size = 11)
  ) +
  geom_jitter(color = "black", size = 0.4, alpha = 0.05) +
  # `fun` replaces the deprecated `fun.y` argument of stat_summary()
  stat_summary(fun = mean, geom = "point", shape = 4, size = 2, color = "red", fill = "red") +
  coord_flip() +
  ylim(0, 1) +
  xlab("Kategorie") +
  ylab("Verhältnis")
## Warning: Removed 91 rows containing missing values (geom_point).

Verhältnis Text (inkl. Satz- und Leerzeichen) zu Sonderzeichen

# Box plot of charratioB per category, categories ordered by their median.
# Individual observations are overlaid as faint jittered points and the
# category mean is marked with a red cross.
dataset %>%
  ggplot(aes(
    # na.rm = TRUE is forwarded to median() so that a category containing
    # missing ratios is still ordered by the median of its observed values
    # instead of getting an NA score.
    x = reorder(category, charratioB, FUN = median, na.rm = TRUE),
    y = charratioB,
    group = category
  )) +
  geom_boxplot() +
  theme(
    legend.position = "none",
    plot.title = element_text(size = 11)
  ) +
  geom_jitter(color = "black", size = 0.4, alpha = 0.05) +
  # `fun` replaces the deprecated `fun.y` argument of stat_summary()
  stat_summary(fun = mean, geom = "point", shape = 4, size = 2, color = "red", fill = "red") +
  coord_flip() +
  ylim(0, 1) +
  xlab("Kategorie") +
  ylab("Verhältnis")
## Warning: Removed 76 rows containing missing values (geom_point).

Differenz der beiden Verhältnisse (inkl. minus exkl. Satz- und Leerzeichen zu Sonderzeichen)

# Box plot of the difference charratioB - charratioA per category.
# Individual observations are overlaid as faint jittered points and the
# category mean is marked with a red cross.
dataset %>%
  ggplot(aes(
    # Order by the median of the same quantity that is plotted (B - A).
    # The ordering key previously used A - B, which reversed the category
    # order relative to the other two plots. na.rm = TRUE is forwarded to
    # median() so categories with missing ratios still get a usable score.
    x = reorder(category, charratioB - charratioA, FUN = median, na.rm = TRUE),
    y = charratioB - charratioA,
    group = category
  )) +
  geom_boxplot() +
  theme(
    legend.position = "none",
    plot.title = element_text(size = 11)
  ) +
  geom_jitter(color = "black", size = 0.4, alpha = 0.05) +
  # `fun` replaces the deprecated `fun.y` argument of stat_summary()
  stat_summary(fun = mean, geom = "point", shape = 4, size = 2, color = "red", fill = "red") +
  coord_flip() +
  # NOTE(review): with these limits any negative difference lies outside the
  # axis range and is dropped from the plot; confirm that this is intended.
  ylim(0, 1) +
  xlab("Kategorie") +
  ylab("Differenz beiden Verhältnissen")
## Warning: Removed 249 rows containing missing values (geom_point).